knitr::opts_chunk$set(
warning = FALSE, message = FALSE,
fig.width = 10, fig.height = 8
)
library(tidyverse)
library(magrittr)
library(limma)
library(edgeR)
library(ggrepel)
library(ggplot2)
library(pheatmap)
library(AnnotationHub)
library(ensembldb)
# library(plotly)
# library(RColorBrewer)
# library(clusterProfiler)
# library(org.Mm.eg.db)
# library(KEGGREST)
# library(fgsea)
# library(data.table)
library(scales)
library(broom)
library(glue)
library(pander)
theme_set(
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
)
counts <- here::here("data", "genes.out.gz") %>%
read_tsv() %>%
rename_all(basename) %>%
rename_all(str_remove_all, pattern = "Aligned.sortedByCoord.out.bam")
samples <- tibble(
sample = setdiff(colnames(counts), "Geneid"),
condition = gsub("M2_|M3_|M5_|M6_", "", sample) %>%
factor(levels = c("DN", "DP", "LAG_3", "CD49b")),
mouse = str_extract(sample, "M[0-9]"),
label = glue("{condition} ({mouse})")
)
dgeList <- counts %>%
as.data.frame() %>%
column_to_rownames("Geneid") %>%
as.matrix() %>%
DGEList(
samples = samples
) %>%
calcNormFactors(method = "TMM")
ah <- AnnotationHub()
#Find which genome did we use
# unique(ah$dataprovider)
# subset(ah, dataprovider == "Ensembl") %>% #In Ensembl databse
# subset(species == "Mus musculus") %>% #under Mouse
# subset(rdataclass == "EnsDb")
ensDb <- ah[["AH69210"]] #This is the genome used for assigning reads to genes
genes <- genes(ensDb) %>% #extract the genes
subset(seqnames %in% c(1:19, "MT", "X", "Y")) %>%
keepStandardChromosomes() %>%
sortSeqlevels()
dgeList$genes <- data.frame(gene_id = rownames(dgeList)) %>%
left_join(
mcols(genes) %>%
as.data.frame() %>%
dplyr::select(gene_id, gene_name, gene_biotype, description, entrezid),
by = "gene_id"
)
dgeList$samples %>%
mutate(CellType = dgeList$samples$condition) %>%
ggplot(aes(x = label, y = lib.size / 1e6, fill = CellType)) +
geom_col() +
geom_hline(
yintercept = mean(dgeList$samples$lib.size / 1e6),
linetype = 2, colour = "grey30"
) +
labs(x = "Sample", y = "Library Size (millions)") +
facet_wrap(~mouse, scales = "free_x", nrow = 1) +
scale_x_discrete(labels = label_wrap(5)) +
scale_y_continuous(expand = expansion(c(0, 0.05)))
Figure S7a Library sizes for all libraries after summarisation to gene-level counts. The mean library size across all libraries is shown as a dashed horizontal line.
genes2keep <- dgeList %>%
cpm(log = TRUE) %>%
is_greater_than(1) %>%
rowSums() %>%
is_greater_than(4)
dgeFilt <- dgeList[genes2keep,, keep.lib.sizes = FALSE]
Genes were only retained in the final dataset if 4 or more samples returned \(>1\) log2 Counts per Million (logCPM). The gave a dataset of 11,117 of the initial 55,450 genes which were retained for downstream analysis.
dgeFilt %>%
cpm(log = TRUE) %>%
as_tibble(rownames = "gene_id") %>%
pivot_longer(cols = all_of(colnames(dgeFilt)), names_to = "sample", values_to = "logCPM") %>%
left_join(dgeFilt$samples, by = "sample") %>%
ggplot(aes(logCPM, y = stat(density), colour = condition, group = sample)) +
geom_density() +
facet_wrap(~mouse) +
scale_y_continuous(expand = expansion(c(0.01, 0.05)))
logCPM densisties after removal of undetectable genes. The double negative (DN) samples for both m% and M6 appear to skew to lower overall counts compared to all other samples, with a handful of highly expressed genes likely to dominate the sample.
pca <- dgeFilt %>%
cpm(log = TRUE) %>%
t() %>%
prcomp()
pca %>%
broom::tidy() %>%
dplyr::rename(sample = row) %>%
dplyr::filter(PC %in% c(1, 2)) %>%
pivot_wider(names_from = "PC", names_prefix = "PC", values_from = "value") %>%
left_join(dgeFilt$samples, by = "sample") %>%
ggplot(aes(PC1, PC2, colour = condition)) +
geom_point(size = 3) +
geom_text_repel(
aes(label = label),#str_replace_all(label, " ", "\n")),
max.overlaps = Inf
) +
labs(
x = glue("PC1 ({percent(summary(pca)$importance[2, 'PC1'])})"),
y = glue("PC2 ({percent(summary(pca)$importance[2, 'PC2'])})"),
colour = "Cell Type"
)
Figure S7b PCA on logCPM values, with the two DN samples identified above clearly showing strong divergence from the remainder of the dataset. The CD49b sample fro M2 also appeared slightly divergent, with the previous density plot also showing a slght skew towards lower overall counts.
The above PCA revealed some potential problems with two of the four DN samples. Exclusion of the clear outliers will reduce the number of viable samples within the DN group to 2 and as an alternative, a weighting strategy was instead sought for all samples.
U <- matrix(1, nrow = ncol(dgeFilt)) %>%
set_colnames("Intercept")
v <- voomWithQualityWeights(dgeFilt, design = U)
X <- model.matrix(~0 + condition, data = dgeFilt$samples) %>%
set_colnames(str_remove(colnames(.), "condition"))
rho <- duplicateCorrelation(v, X, block=dgeFilt$samples$mouse)$consensus.correlation
v <- voomWithQualityWeights(
counts = dgeFilt, design = U, block=dgeFilt$samples$mouse, correlation=rho
)
Sample-level weights were estimated by assuming all samples were
drawn from the same group and running
voomWithQualityWeights(). After running this, samples were
compared within each mouse-of-origin and correlations within mice were
estimated using duplicateCorrelation() (\(\rho=\) 0.118).
voomWithQualityWeights() was then run again setting mouse
as the blocking variable and including the consensus correlations
v$targets %>%
ggplot(aes(label, sample.weights, fill = condition)) +
geom_col() +
geom_hline(yintercept = 1, linetype = 2, col = "grey30") +
facet_wrap(~mouse, nrow = 1, scales = "free_x") +
scale_x_discrete(labels = scales::label_wrap(5)) +
scale_y_continuous(expand = expansion(c(0, 0.05))) +
labs(
x = "Sample", y = "Sample Weights", fill = "Cell Type"
)
Sample-level weights after running
voomWithQualityWeights setting all samples as being drawn
from the same condition. The ideal equal wweighting of 1 is shown as the
dashed horizontal line, with those samples below this being assumed to
be of lower quality than those above the line. Thw two previously
identified DN samples were strongly down-weighted, as was the CD49b
sample from M2
cont.matrix <- makeContrasts(
DNvLAG3 = DN - LAG_3,
DNvDP = DN - DP,
DNvCD49b = DN - CD49b,
LAG3vCD49b = LAG_3 - CD49b,
CD49bvDP = CD49b - DP,
LAG3vDP = LAG_3 - DP,
levels = X
)
fit <- lmFit(v, design = X, block = v$targets$mouse, correlation = rho) %>%
contrasts.fit(cont.matrix) %>%
# treat()
eBayes()
top_tables <- colnames(cont.matrix) %>%
lapply(function(x) topTable(fit, coef = x, number = Inf)) %>%
# lapply(function(x) topTreat(fit, coef = x, number = Inf)) %>%
lapply(as_tibble) %>%
lapply(mutate, DE = adj.P.Val < 0.05 & abs(logFC) > 1) %>%
setNames(colnames(cont.matrix))
top_tables %>%
lapply(
function(x){
df <- dplyr::filter(x,DE)
tibble(
Up = sum(df$logFC > 0),
Down = sum(df$logFC < 0),
`Total DE` = Up + Down
)
}
) %>%
bind_rows(.id = "Comparison") %>%
pander(
justify = "lrrr",
caption = "Results from each comparison, where genes are considered DE using an FDR < 0.05 along with an estimated logFC beyond $\\pm1$."
)
| Comparison | Up | Down | Total DE |
|---|---|---|---|
| DNvLAG3 | 46 | 31 | 77 |
| DNvDP | 27 | 34 | 61 |
| DNvCD49b | 16 | 2 | 18 |
| LAG3vCD49b | 111 | 56 | 167 |
| CD49bvDP | 6 | 32 | 38 |
| LAG3vDP | 16 | 17 | 33 |
top_tables$DNvLAG3 %>%
ggplot(aes(AveExpr, logFC)) +
geom_point(aes(colour = DE), alpha = 0.6) +
geom_smooth(se = FALSE) +
geom_label_repel(
aes(label = gene_name, colour = DE),
data = . %>%
dplyr::filter(DE, logFC > 0) %>%
arrange(desc(logFC)) %>%
dplyr::slice(1:5),
max.overlaps = Inf,
show.legend = FALSE
) +
geom_label_repel(
aes(label = gene_name, colour = DE),
data = . %>%
dplyr::filter(DE, logFC < 0) %>%
arrange(logFC) %>%
dplyr::slice(1:5),
max.overlaps = Inf,
show.legend = FALSE
) +
ggtitle("DN Vs. LAG3") +
scale_colour_manual(values = c("grey30", "red"))
MA-Plot for DN Vs. LAG3. The 5 most up/down-regulated genes are labelled, with the blue line representing a spline through the data.
top_tables$DNvDP %>%
ggplot(aes(AveExpr, logFC)) +
geom_point(aes(colour = DE), alpha = 0.6) +
geom_smooth(se = FALSE) +
geom_label_repel(
aes(label = gene_name, colour = DE),
data = . %>%
dplyr::filter(DE, logFC > 0) %>%
arrange(desc(logFC)) %>%
dplyr::slice(1:5),
max.overlaps = Inf,
show.legend = FALSE
) +
geom_label_repel(
aes(label = gene_name, colour = DE),
data = . %>%
dplyr::filter(DE, logFC < 0) %>%
arrange(logFC) %>%
dplyr::slice(1:5),
max.overlaps = Inf,
show.legend = FALSE
) +
ggtitle("DN Vs. DP") +
scale_colour_manual(values = c("grey30", "red"))
MA-Plot for DN Vs. DP. The 5 most up/down-regulated genes are labelled, with the blue line representing a spline through the data.
top_tables$DNvCD49b %>%
ggplot(aes(AveExpr, logFC)) +
geom_point(aes(colour = DE), alpha = 0.6) +
geom_smooth(se = FALSE) +
geom_label_repel(
aes(label = gene_name, colour = DE),
data = . %>%
dplyr::filter(DE, logFC > 0) %>%
arrange(desc(logFC)) %>%
dplyr::slice(1:5),
max.overlaps = Inf,
show.legend = FALSE
) +
geom_label_repel(
aes(label = gene_name, colour = DE),
data = . %>%
dplyr::filter(DE, logFC < 0) %>%
arrange(logFC) %>%
dplyr::slice(1:5),
max.overlaps = Inf,
show.legend = FALSE
) +
ggtitle("DN Vs. CD49b") +
scale_colour_manual(values = c("grey30", "red"))
MA-Plot for DN Vs. CD49b The most up/down-regulated genes are labelled, with the blue line representing a spline through the data.
top_tables$LAG3vCD49b %>%
ggplot(aes(AveExpr, logFC)) +
geom_point(aes(colour = DE), alpha = 0.6) +
geom_smooth(se = FALSE) +
geom_label_repel(
aes(label = gene_name, colour = DE),
data = . %>%
dplyr::filter(DE, logFC > 0) %>%
arrange(desc(logFC)) %>%
dplyr::slice(1:5),
max.overlaps = Inf,
show.legend = FALSE
) +
geom_label_repel(
aes(label = gene_name, colour = DE),
data = . %>%
dplyr::filter(DE, logFC < 0) %>%
arrange(logFC) %>%
dplyr::slice(1:5),
max.overlaps = Inf,
show.legend = FALSE
) +
ggtitle("LAG3 Vs. CD49b") +
scale_colour_manual(values = c("grey30", "red"))
MA-Plot for LAG3 Vs. CD49b. The 5 most up/down-regulated genes are labelled, with the blue line representing a spline through the data.
top_tables$LAG3vDP %>%
ggplot(aes(AveExpr, logFC)) +
geom_point(aes(colour = DE), alpha = 0.6) +
geom_smooth(se = FALSE) +
geom_label_repel(
aes(label = gene_name, colour = DE),
data = . %>%
dplyr::filter(DE, logFC > 0) %>%
arrange(desc(logFC)) %>%
dplyr::slice(1:5),
max.overlaps = Inf,
show.legend = FALSE
) +
geom_label_repel(
aes(label = gene_name, colour = DE),
data = . %>%
dplyr::filter(DE, logFC < 0) %>%
arrange(logFC) %>%
dplyr::slice(1:5),
max.overlaps = Inf,
show.legend = FALSE
) +
ggtitle("LAG3 Vs. DP") +
scale_colour_manual(values = c("grey30", "red"))
MA-Plot for LAG3 Vs. DP The 5 most up/down-regulated genes are labelled, with the blue line representing a spline through the data.
top_tables$CD49bvDP %>%
ggplot(aes(AveExpr, logFC)) +
geom_point(aes(colour = DE), alpha = 0.6) +
geom_smooth(se = FALSE) +
geom_label_repel(
aes(label = gene_name, colour = DE),
data = . %>%
dplyr::filter(DE, logFC > 0) %>%
arrange(desc(logFC)) %>%
dplyr::slice(1:5),
max.overlaps = Inf,
show.legend = FALSE
) +
geom_label_repel(
aes(label = gene_name, colour = DE),
data = . %>%
dplyr::filter(DE, logFC < 0) %>%
arrange(logFC) %>%
dplyr::slice(1:5),
max.overlaps = Inf,
show.legend = FALSE
) +
ggtitle("CD49b Vs. DP") +
scale_colour_manual(values = c("grey30", "red"))
MA-Plot for CD49b Vs. DP. The 5 most up/down-regulated genes are labelled, with the blue line representing a spline through the data.
top_tables$DNvLAG3 %>%
ggplot(aes(logFC, -log10(P.Value))) +
geom_point(aes(colour = DE), alpha = 0.6) +
geom_label_repel(
aes(label = gene_name, colour = DE),
data = . %>%
dplyr::filter(DE) %>%
arrange(P.Value) %>%
dplyr::slice(1:10),
max.overlaps = Inf,
show.legend = FALSE
) +
ggtitle("DN Vs. LAG3") +
scale_colour_manual(values = c("grey30", "red"))
Volcano Plot for DN Vs. LAG3. The 10 most highly-ranked genes are labelled, with the blue line representing a spline through the data.
top_tables$DNvDP %>%
ggplot(aes(logFC, -log10(P.Value))) +
geom_point(aes(colour = DE), alpha = 0.6) +
geom_label_repel(
aes(label = gene_name, colour = DE),
data = . %>%
dplyr::filter(DE) %>%
arrange(P.Value) %>%
dplyr::slice(1:10),
max.overlaps = Inf,
show.legend = FALSE
) +
ggtitle("DN Vs. DP") +
scale_colour_manual(values = c("grey30", "red"))
Volcano Plot for DN Vs. DP. The 10 most highly-ranked genes are labelled, with the blue line representing a spline through the data.
top_tables$DNvCD49b %>%
ggplot(aes(logFC, -log10(P.Value))) +
geom_point(aes(colour = DE), alpha = 0.6) +
geom_label_repel(
aes(label = gene_name, colour = DE),
data = . %>%
dplyr::filter(DE) %>%
arrange(P.Value) %>%
dplyr::slice(1:10),
max.overlaps = Inf,
show.legend = FALSE
) +
ggtitle("DN Vs. CD49b") +
scale_colour_manual(values = c("grey30", "red"))
Volcano Plot for DN Vs. CD49b The most up/down-regulated genes are labelled, with the blue line representing a spline through the data.
top_tables$LAG3vCD49b %>%
ggplot(aes(logFC, -log10(P.Value))) +
geom_point(aes(colour = DE), alpha = 0.6) +
geom_label_repel(
aes(label = gene_name, colour = DE),
data = . %>%
dplyr::filter(DE) %>%
arrange(P.Value) %>%
dplyr::slice(1:10),
max.overlaps = Inf,
show.legend = FALSE
) +
ggtitle("LAG3 Vs. CD49b") +
scale_colour_manual(values = c("grey30", "red"))
Volcano Plot for LAG3 Vs. CD49b. The 10 most highly-ranked genes are labelled, with the blue line representing a spline through the data.
top_tables$LAG3vDP %>%
ggplot(aes(logFC, -log10(P.Value))) +
geom_point(aes(colour = DE), alpha = 0.6) +
geom_label_repel(
aes(label = gene_name, colour = DE),
data = . %>%
dplyr::filter(DE) %>%
arrange(P.Value) %>%
dplyr::slice(1:10),
max.overlaps = Inf,
show.legend = FALSE
) +
ggtitle("LAG3 Vs. DP") +
scale_colour_manual(values = c("grey30", "red"))
Volcano Plot for LAG3 Vs. DP The 10 most highly-ranked genes are labelled, with the blue line representing a spline through the data.
top_tables$CD49bvDP %>%
ggplot(aes(logFC, -log10(P.Value))) +
geom_point(aes(colour = DE), alpha = 0.6) +
geom_label_repel(
aes(label = gene_name, colour = DE),
data = . %>%
dplyr::filter(DE) %>%
arrange(P.Value) %>%
dplyr::slice(1:10),
max.overlaps = Inf,
show.legend = FALSE
) +
ggtitle("CD49b Vs. DP") +
scale_colour_manual(values = c("grey30", "red"))
Volcano Plot for CD49b Vs. DP. The 10 most highly-ranked genes are labelled, with the blue line representing a spline through the data.